Homework 7
- Doing Challenge 4 from Data and Graphics Challenge
InĀ [21]:
import altair as alt
import pandas as pd
InĀ [22]:
countryCodes = pd.read_csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv")
countryCodes.head()
Out[22]:
| name | alpha-2 | alpha-3 | country-code | iso_3166-2 | region | sub-region | intermediate-region | region-code | sub-region-code | intermediate-region-code | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AF | AFG | 4 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 1 | Ć land Islands | AX | ALA | 248 | ISO 3166-2:AX | Europe | Northern Europe | NaN | 150.0 | 154.0 | NaN |
| 2 | Albania | AL | ALB | 8 | ISO 3166-2:AL | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN |
| 3 | Algeria | DZ | DZA | 12 | ISO 3166-2:DZ | Africa | Northern Africa | NaN | 2.0 | 15.0 | NaN |
| 4 | American Samoa | AS | ASM | 16 | ISO 3166-2:AS | Oceania | Polynesia | NaN | 9.0 | 61.0 | NaN |
InĀ [31]:
gasolinePrices = pd.read_csv("./pump_price_for_gasoline_us_per_liter.csv")
gasolinePrices.head()
Out[31]:
| country | 1991 | 1992 | 1993 | 1994 | 1995 | 1996 | 1997 | 1998 | 1999 | ... | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 1.05 | NaN | 1.15 | NaN | 1.28 | NaN | 1.07 | NaN | 0.7 |
| 1 | Angola | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.38 | NaN | ... | NaN | 0.53 | NaN | 0.65 | NaN | 0.63 | NaN | 0.76 | NaN | 0.97 |
| 2 | Albania | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.86 | NaN | ... | NaN | 1.36 | NaN | 1.46 | NaN | 1.81 | NaN | 1.76 | NaN | 1.36 |
| 3 | Andorra | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 1.24 | NaN | 1.49 | NaN | 1.67 | NaN | 1.51 | NaN | NaN |
| 4 | UAE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.23 | NaN | ... | NaN | 0.45 | NaN | 0.47 | NaN | 0.47 | NaN | 0.47 | NaN | 0.49 |
5 rows Ć 27 columns
InĀ [32]:
gasPricesLong = pd.melt(gasolinePrices, id_vars=['country'],
var_name='Year', value_name='GasPrice')
gasPricesLong.head()
Out[32]:
| country | Year | GasPrice | |
|---|---|---|---|
| 0 | Afghanistan | 1991 | NaN |
| 1 | Angola | 1991 | NaN |
| 2 | Albania | 1991 | NaN |
| 3 | Andorra | 1991 | NaN |
| 4 | UAE | 1991 | NaN |
InĀ [29]:
country_dropdown = alt.binding_select(options=[None] + sorted(gasPricesLong['country'].unique()), name='country ')
country_selection = alt.selection_point(fields=['country'], bind=country_dropdown, name="Select", empty=True)
explorationChart = alt.Chart(gasPricesLong).mark_point().encode(
x='Year:O',
y='GasPrice:Q',
color='country:N',
tooltip=['country', 'Year', 'GasPrice']
).transform_filter(
country_selection
).properties(
width=600,
height=400
).add_params(
country_selection
)
explorationChart
Out[29]:
InĀ [36]:
gasPricesLongSub = pd.merge(gasPricesLong, countryCodes[['name', 'sub-region']],
left_on='country', right_on='name', how='left')
gasPricesLongSub['GasPrice'] = pd.to_numeric(gasPricesLongSub['GasPrice'], errors='coerce')
gasPricesLongSub = gasPricesLongSub.groupby(['sub-region', 'Year'])['GasPrice'].mean().reset_index()
gasPricesLongSub['GasPrice'] = pd.to_numeric(gasPricesLongSub['GasPrice'], errors='coerce')
gasPricesLongSub.head()
Out[36]:
| sub-region | Year | GasPrice | |
|---|---|---|---|
| 0 | Australia and New Zealand | 1991 | NaN |
| 1 | Australia and New Zealand | 1992 | NaN |
| 2 | Australia and New Zealand | 1993 | NaN |
| 3 | Australia and New Zealand | 1994 | NaN |
| 4 | Australia and New Zealand | 1995 | 0.61 |
InĀ [43]:
sub_dropdown = alt.binding_select(options=[None] + sorted(gasPricesLongSub['sub-region'].unique()), name='Sub Region')
sub_selection = alt.selection_point(fields=['sub-region'], bind=sub_dropdown, name="Select", empty=True)
chart = alt.Chart(gasPricesLongSub).mark_line().encode(
x='Year:O',
y='GasPrice:Q',
color=alt.Color('sub-region:N', legend=None),
tooltip=['sub-region:N', 'Year:O', 'GasPrice:Q']
).transform_filter(
sub_selection
).properties(
title='SubRegion Average Gas Price By Year',
width=500,
height=500
).transform_filter(
alt.datum.GasPrice != None
).add_params(
sub_selection
)
chart
Out[43]:
World Values Survey
- How Many Respondents In Each Country
InĀ [44]:
worldValues = pd.read_csv("https://calvin-data304.netlify.app/data/wvs.csv")
worldValues.head()
Out[44]:
| sex | birth_year | birth_country_iso | age | age6 | age3 | married | married_before | country | COW_NUM | COW_ALPHA | democracy_importance | wave_chronology | ISO_country | S004 | respondent_number_orig | respondent_number_unified | weight | weight_equilibrated | survey_year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 1975.0 | 9999 | 43 | 3 | 2 | 1 | -4 | AUS | 900 | AUL | 9 | 7 | 36 | -4 | 36071236 | 36720001 | 1.010623 | 0.551572 | 2018 |
| 1 | 1 | 1957.0 | 36 | 60 | 5 | 3 | 1 | -4 | AUS | 900 | AUL | 10 | 7 | 36 | -4 | 36070000 | 36720002 | 0.651305 | 0.551572 | 2018 |
| 2 | 1 | 1977.0 | 9999 | 41 | 3 | 2 | 1 | -4 | AUS | 900 | AUL | 6 | 7 | 36 | -4 | 36070001 | 36720003 | 1.116451 | 0.551572 | 2018 |
| 3 | 2 | 1974.0 | 9999 | 43 | 3 | 2 | 1 | -4 | AUS | 900 | AUL | 9 | 7 | 36 | -4 | 36070002 | 36720004 | 0.591649 | 0.551572 | 2018 |
| 4 | 2 | 1970.0 | 9999 | 48 | 4 | 2 | 1 | -4 | AUS | 900 | AUL | 10 | 7 | 36 | -4 | 36070003 | 36720005 | 1.589662 | 0.551572 | 2018 |
InĀ [51]:
respondentsCountry = worldValues.groupby('country').size().reset_index(name = "RespondentCount")
respondentsCountry = pd.merge(respondentsCountry, countryCodes[['alpha-3', 'name']],
left_on='country', right_on='alpha-3', how='left')
respondentsCountry.head()
Out[51]:
| country | RespondentCount | alpha-3 | name | |
|---|---|---|---|---|
| 0 | AUS | 1773 | AUS | Australia |
| 1 | CAN | 4018 | CAN | Canada |
| 2 | DEU | 1520 | DEU | Germany |
| 3 | GBR | 2399 | GBR | United Kingdom of Great Britain and Northern I... |
| 4 | KOR | 1245 | KOR | Korea, Republic of |
InĀ [63]:
respondentGraph = alt.Chart(respondentsCountry).mark_bar().encode(
x=alt.X('country:N', sort='-y', title=""),
y=alt.Y('RespondentCount:Q', title="Number of Respondents"),
color = alt.value('lightblue')
).properties(
title=alt.Title(text="Respondents by Country", align='right')
)
respondentGraph
Out[63]:
Canada had a greater number of respondents then the rest of the countries and then the US and Great Britain stood out but besides that the rest were relatively similar.
InĀ [74]:
worldValuesAge = worldValues[['country', 'age']]
alt.data_transformers.disable_max_rows()
box_plot = alt.Chart(worldValuesAge).mark_boxplot().encode(
x=alt.X('age:Q', title = "Age"),
y=alt.Y('country:N', title=""),
color=alt.value('lightblue')
).properties(
title=alt.TitleParams(
text='Age Distribution by Country',
align='right'
)
)
box_plot
Out[74]:
InĀ [78]:
worldValuesAge3 = worldValues[['country', 'age', 'age3']]
worldValuesAge3 = worldValuesAge3.groupby(['country', 'age3'])['age'].agg(['min', 'max']).reset_index()
worldValuesAge3.head()
Out[78]:
| country | age3 | min | max | |
|---|---|---|---|---|
| 0 | AUS | 1 | 17 | 29 |
| 1 | AUS | 2 | 30 | 49 |
| 2 | AUS | 3 | 50 | 98 |
| 3 | CAN | 1 | 18 | 29 |
| 4 | CAN | 2 | 30 | 49 |
InĀ [86]:
minMax3 = alt.Chart(worldValuesAge3).mark_bar().encode(
x=alt.X('country:N', title='Country'),
y=alt.Y('min:Q', title='Age'),
y2='max:Q',
color=alt.Color('age3:N', title="Grouping")
)
Out[86]:
InĀ [89]:
worldValuesAge6 = worldValues[['country', 'age', 'age6']]
worldValuesAge6 = worldValuesAge6.groupby(['country', 'age6'])['age'].agg(['min', 'max']).reset_index()
minMax6 = alt.Chart(worldValuesAge6).mark_bar().encode(
x=alt.X('country:N', title='Country'),
y=alt.Y('min:Q', title='Age'),
y2='max:Q',
color=alt.Color('age6:N', title="Grouping")
)
comparisonGraph = minMax3 | minMax6
comparisonGraph.properties(
title = alt.TitleParams(
text= "Age Groupings By Country"
)
)
Out[89]:
From this chart we can see that the age groups are the same no matter what country.
InĀ [129]:
worldValuesErrorBand = worldValues[['country', 'age6', 'democracy_importance']]
worldValuesErrorBand['response_10'] = worldValuesErrorBand.loc[:, 'democracy_importance'].apply(lambda x: 1 if x == 10 else 0)
base = alt.Chart(worldValuesErrorBand).encode(
x=alt.X("age6:O", sort='-x',title=""),
y=alt.Y("response_10:Q", title="")
)
error_band = base.mark_errorband(extent="ci")
lineChart = base.mark_line().encode(
y=alt.Y("mean(response_10):Q")
)
points = base.mark_circle(size=15).encode(
y=alt.Y("mean(response_10):Q")
)
combined_chart = error_band + points + lineChart
combined_chart.facet(
column = alt.Column("country:N", title="")).resolve_scale(y='independent').properties(
title="Percentage of people who say it is essential to live in a democracy by age group"
)
/var/folders/jz/n5m4rwz17fb2c7rywz1r25_h0000gn/T/ipykernel_47653/3103417639.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy worldValuesErrorBand['response_10'] = worldValuesErrorBand.loc[:, 'democracy_importance'].apply(lambda x: 1 if x == 10 else 0)
Out[129]:
InĀ [99]:
worldValuesErrorBand = worldValues[['country', 'age6', 'democracy_importance']]
worldValuesErrorBandAgg = worldValuesErrorBand.groupby(['country', 'age6'])['democracy_importance'].mean().reset_index()
worldValuesErrorBand.head()
Out[99]:
| country | age6 | democracy_importance | |
|---|---|---|---|
| 0 | AUS | 3 | 9 |
| 1 | AUS | 5 | 10 |
| 2 | AUS | 3 | 6 |
| 3 | AUS | 3 | 9 |
| 4 | AUS | 4 | 10 |
InĀ [118]:
base = alt.Chart(worldValuesErrorBand).encode(
x=alt.X("age6:O", sort='-x',title=""),
y=alt.Y("democracy_importance:Q", title="")
)
error_band = base.mark_errorband(extent="ci")
lineChart = base.mark_line().encode(
y=alt.Y("mean(democracy_importance):Q")
)
points = base.mark_circle(size=15).encode(
y=alt.Y("mean(democracy_importance):Q")
)
combined_chart = error_band + points + lineChart
combined_chart.facet(
column = alt.Column("country:N", title="")).resolve_scale(y='independent').properties(
title="Average Score(1-10) of people who say it is essential to live in a democracy by age group"
)
Out[118]:
InĀ [132]:
worldValuesErrorBand = worldValues[['age', 'country', 'democracy_importance']]
base = alt.Chart(worldValuesErrorBand).encode(
x=alt.X("age:Q", sort='-x',title=""),
y=alt.Y("democracy_importance:Q", title="")
)
error_band = base.mark_errorband(extent="ci")
lineChart = base.mark_line().encode(
y=alt.Y("mean(democracy_importance):Q")
)
points = base.mark_circle(size=15).encode(
y=alt.Y("mean(democracy_importance):Q")
)
combined_chart = error_band + points + lineChart
combined_chart.facet(
column = alt.Column("country:N", title="")).resolve_scale(y='independent').properties(
title="Average Score(1-10) of people who say it is essential to live in a democracy by age"
)
Out[132]:
This is worse too many points and overwhelming to read harder to see overall trends.
InĀ [153]:
worldValuesLOESS = worldValues[['age', 'country', 'democracy_importance']]
worldValuesLOESS = worldValuesLOESS.groupby(['age', 'country'])['democracy_importance'].mean().reset_index()
pointChart = alt.Chart(worldValuesLOESS).mark_point().encode(
x='age:Q',
y='democracy_importance:Q',
color='country:N'
)
loessLine = alt.Chart(worldValuesLOESS).transform_loess(
'age', 'democracy_importance', groupby=['country']
).mark_line(
size=2
).encode(
x='age:Q',
y='democracy_importance:Q',
color='country:N'
)
finalChart = pointChart + loessLine
finalChart.facet(column="country:N").properties(
title= alt.TitleParams(
text="LOESS Regression"
)
)
Out[153]:
InĀ [154]:
worldValuesLOESS = worldValues[['age', 'country', 'democracy_importance']]
worldValuesLOESS = worldValuesLOESS.groupby(['age', 'country'])['democracy_importance'].mean().reset_index()
pointChart = alt.Chart(worldValuesLOESS).mark_point().encode(
x='age:Q',
y='democracy_importance:Q',
color='country:N'
)
loessLine = alt.Chart(worldValuesLOESS).transform_regression(
'age', 'democracy_importance', groupby=['country'], method='linear'
).mark_line(
size=2
).encode(
x='age:Q',
y='democracy_importance:Q',
color='country:N'
)
finalChart = pointChart + loessLine
finalChart.facet(column="country:N").properties(
title= alt.TitleParams(
text="Linear Regression"
)
)
Out[154]:
InĀ [155]:
worldValuesLOESS = worldValues[['age', 'country', 'democracy_importance']]
worldValuesLOESS = worldValuesLOESS.groupby(['age', 'country'])['democracy_importance'].mean().reset_index()
pointChart = alt.Chart(worldValuesLOESS).mark_point().encode(
x='age:Q',
y='democracy_importance:Q',
color='country:N'
)
loessLine = alt.Chart(worldValuesLOESS).transform_regression(
'age', 'democracy_importance', groupby=['country'], method='poly'
).mark_line(
size=2
).encode(
x='age:Q',
y='democracy_importance:Q',
color='country:N'
)
finalChart = pointChart + loessLine
finalChart.facet(column="country:N").properties(
title= alt.TitleParams(
text="Polynomial Regression"
)
)
Out[155]: